In [1]:
import os
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR (e.g. a pathlib.Path) so the notebook runs on other machines.
path="C:\\Users\\tharu\\Downloads\\Uber\\"
In [2]:
os.listdir(path+'Datasets')
Out[2]:
['other-American_B01362.csv',
 'other-Carmel_B00256.csv',
 'other-Dial7_B00887.csv',
 'other-Diplo_B01196.csv',
 'other-Federal_02216.csv',
 'other-FHV-services_jan-aug-2015.csv',
 'other-Firstclass_B01536.csv',
 'other-Highclass_B01717.csv',
 'other-Lyft_B02510.csv',
 'other-Prestige_B01338.csv',
 'other-Skyline_B00111.csv',
 'Uber-Jan-Feb-FOIL.csv',
 'uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-janjune-15.csv',
 'uber-raw-data-janjune-15_sample.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [4]:
uber=pd.read_csv(path+'Datasets'+'\\uber-raw-data-janjune-15_sample.csv')
In [5]:
uber.shape
Out[5]:
(100000, 4)
In [6]:
uber.columns
Out[6]:
Index(['Dispatching_base_num', 'Pickup_date', 'Affiliated_base_num',
       'locationID'],
      dtype='object')
In [7]:
uber.head(7)
Out[7]:
Dispatching_base_num Pickup_date Affiliated_base_num locationID
0 B02617 2015-05-02 21:43:00 B02764 237
1 B02682 2015-01-20 19:52:59 B02682 231
2 B02617 2015-03-19 20:26:00 B02617 161
3 B02764 2015-04-10 17:38:00 B02764 107
4 B02764 2015-03-23 07:03:00 B00111 140
5 B02617 2015-05-03 19:42:00 B02617 87
6 B02682 2015-01-14 20:21:50 B02764 125
In [8]:
uber.dtypes
Out[8]:
Dispatching_base_num    object
Pickup_date             object
Affiliated_base_num     object
locationID               int64
dtype: object
In [9]:
uber["Pickup_date"]=pd.to_datetime(uber["Pickup_date"])
In [10]:
uber.dtypes
Out[10]:
Dispatching_base_num            object
Pickup_date             datetime64[ns]
Affiliated_base_num             object
locationID                       int64
dtype: object
In [11]:
uber.isnull().sum()
Out[11]:
Dispatching_base_num       0
Pickup_date                0
Affiliated_base_num     1118
locationID                 0
dtype: int64

Here we can ignore the null values, as we are mainly focusing on analyzing the monthly highest pickups and the hourly rush

In [12]:
uber.duplicated().sum()
Out[12]:
54
In [13]:
uber[uber.duplicated()==True].head(10)
Out[13]:
Dispatching_base_num Pickup_date Affiliated_base_num locationID
15345 B02682 2015-06-23 19:11:00 B02682 164
16424 B02682 2015-03-22 00:27:00 B02682 249
17934 B02764 2015-04-19 01:53:00 B02764 144
19410 B02682 2015-06-28 11:49:00 B02682 107
23936 B02682 2015-05-10 13:19:00 B02682 234
29417 B02764 2015-03-31 19:13:00 B02764 68
37104 B02682 2015-04-30 22:01:00 B02682 161
39065 B02682 2015-04-20 17:51:00 B02682 143
39152 B02764 2015-05-28 22:43:00 B02764 230
39305 B02682 2015-06-14 23:40:00 B02682 79

Similarly, we can keep the duplicated rows, as they may not be true duplicates: the timestamps have only minute resolution, so identical rows can still represent distinct trips

In [14]:
uber.size
Out[14]:
400000
In [15]:
# Derive calendar features from the parsed pickup timestamp.
pickup = uber["Pickup_date"].dt
uber["month"] = pickup.month_name()
uber["day"] = pickup.day_name()
uber["hour"] = pickup.hour
In [16]:
# Sanity-check the new month/day columns on the first few rows.
print(uber['month'].head(), uber["day"].head(), sep="\n")
0        May
1    January
2      March
3      April
4      March
Name: month, dtype: object
0    Saturday
1     Tuesday
2    Thursday
3      Friday
4      Monday
Name: day, dtype: object
In [17]:
uber.dtypes
Out[17]:
Dispatching_base_num            object
Pickup_date             datetime64[ns]
Affiliated_base_num             object
locationID                       int64
month                           object
day                             object
hour                             int64
dtype: object
In [18]:
uber['month'].value_counts()
Out[18]:
June        19636
May         18667
April       15995
March       15979
February    15903
January     13820
Name: month, dtype: int64
In [19]:
uber.groupby(by=["month"],as_index=False).size()
Out[19]:
month size
0 April 15995
1 February 15903
2 January 13820
3 June 19636
4 March 15979
5 May 18667
In [20]:
# Trips per month, busiest first. The explicit `axis=0` groupby argument is
# deprecated in pandas 2.x (it was the default anyway), so it is dropped.
uber_month_high = (
    uber.groupby("month", as_index=False)
        .size()
        .sort_values(by="size", ascending=False)
)
uber_month_high
Out[20]:
month size
3 June 19636
5 May 18667
0 April 15995
4 March 15979
1 February 15903
2 January 13820
In [21]:
import seaborn as sns
In [22]:
sns.barplot(x=uber_month_high["month"],y=uber_month_high["size"])
Out[22]:
<Axes: xlabel='month', ylabel='size'>
In [23]:
# Trips per (month, day) combination, busiest first. The deprecated
# `axis=0` groupby argument (pandas 2.x) is removed.
uber_month_day_high = (
    uber.groupby(["month", "day"], as_index=False)
        .size()
        .sort_values(by="size", ascending=False)
)
sns.barplot(data=uber_month_day_high, x="month", y="size", hue="day")
Out[23]:
<Axes: xlabel='month', ylabel='size'>
In [24]:
# Trips per (weekday, hour) slot, busiest first. The deprecated `axis=0`
# groupby argument (pandas 2.x) is removed.
uber_hour_high = (
    uber.groupby(["day", "hour"], as_index=False)
        .size()
        .sort_values(by="size", ascending=False)
)
uber_hour_high
Out[24]:
day hour size
71 Saturday 23 1292
23 Friday 23 1211
19 Friday 19 1205
70 Saturday 22 1199
67 Saturday 19 1191
... ... ... ...
27 Monday 3 100
148 Wednesday 4 97
122 Tuesday 2 94
147 Wednesday 3 64
123 Tuesday 3 62

168 rows × 3 columns

In [25]:
sns.pointplot(x=uber_hour_high["hour"],y=uber_hour_high["size"],hue=uber_hour_high["day"])
Out[25]:
<Axes: xlabel='hour', ylabel='size'>
In [26]:
def period(x):
    """Map an hour of day to a coarse time-of-day label.

    Returns "Morning" for 6-12, "Afternoon" for 13-17, "Evening" for
    18-23, and "Night" otherwise (0-5).
    """
    # Chained comparisons are the idiomatic form of `x>=6 and x<=12`.
    if 6 <= x <= 12:
        return "Morning"
    elif 12 < x <= 17:
        return "Afternoon"
    elif 17 < x <= 23:
        return "Evening"
    else:
        return "Night"
# Label every trip with its time-of-day bucket.
uber["period"] = uber["hour"].map(period)
In [27]:
uber["period"].value_counts()
Out[27]:
Evening      39302
Morning      24427
Afternoon    23679
Night        12592
Name: period, dtype: int64
In [28]:
period=pd.crosstab(uber["month"],uber["period"])
In [29]:
# Heatmap of trip counts per month x time-of-day bucket.
plt.figure(figsize=(8, 3))
sns.heatmap(period, annot=True, fmt=".0f", cmap="crest")
Out[29]:
<Axes: xlabel='period', ylabel='month'>
In [30]:
# FOIL data: active vehicles and trips per dispatching base per day (Jan-Feb 2015).
uber1 = pd.read_csv(f"{path}Datasets\\Uber-Jan-Feb-FOIL.csv")
uber1.head()
Out[30]:
dispatching_base_number date active_vehicles trips
0 B02512 1/1/2015 190 1132
1 B02765 1/1/2015 225 1765
2 B02764 1/1/2015 3427 29421
3 B02682 1/1/2015 945 7679
4 B02617 1/1/2015 1228 9537
In [31]:
sns.boxplot(data=uber1,x="dispatching_base_number",y="active_vehicles")
Out[31]:
<Axes: xlabel='dispatching_base_number', ylabel='active_vehicles'>
In [32]:
!pip install plotly
Requirement already satisfied: plotly in c:\users\tharu\anaconda3\lib\site-packages (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\tharu\anaconda3\lib\site-packages (from plotly) (8.2.2)
In [33]:
import plotly.express as px

# Interactive box plot with every underlying point shown alongside the box.
px.box(
    uber1,
    y="dispatching_base_number",
    x="active_vehicles",
    points="all",
    color="dispatching_base_number",
)

Spatial Analytics

In [34]:
pip install folium
Requirement already satisfied: folium in c:\users\tharu\anaconda3\lib\site-packages (0.19.4)
Requirement already satisfied: branca>=0.6.0 in c:\users\tharu\anaconda3\lib\site-packages (from folium) (0.8.1)
Requirement already satisfied: jinja2>=2.9 in c:\users\tharu\anaconda3\lib\site-packages (from folium) (3.1.2)
Requirement already satisfied: numpy in c:\users\tharu\anaconda3\lib\site-packages (from folium) (1.24.3)
Requirement already satisfied: requests in c:\users\tharu\anaconda3\lib\site-packages (from folium) (2.31.0)
Requirement already satisfied: xyzservices in c:\users\tharu\anaconda3\lib\site-packages (from folium) (2022.9.0)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\tharu\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.1.1)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (1.26.16)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (2024.2.2)
Note: you may need to restart the kernel to use updated packages.
In [35]:
import folium
from folium.plugins import HeatMap
In [36]:
l=os.listdir(path+'Datasets')
In [37]:
l
Out[37]:
['other-American_B01362.csv',
 'other-Carmel_B00256.csv',
 'other-Dial7_B00887.csv',
 'other-Diplo_B01196.csv',
 'other-Federal_02216.csv',
 'other-FHV-services_jan-aug-2015.csv',
 'other-Firstclass_B01536.csv',
 'other-Highclass_B01717.csv',
 'other-Lyft_B02510.csv',
 'other-Prestige_B01338.csv',
 'other-Skyline_B00111.csv',
 'Uber-Jan-Feb-FOIL.csv',
 'uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-janjune-15.csv',
 'uber-raw-data-janjune-15_sample.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']
In [38]:
# NOTE(review): slicing the last 8 entries relies on os.listdir's ordering,
# which is not guaranteed; filtering by filename pattern (e.g.
# startswith("uber-raw-data") and "janjune" not in name) would be more robust.
l=l[-8:]
l.remove('uber-raw-data-janjune-15.csv')
In [39]:
l.remove("uber-raw-data-janjune-15_sample.csv")
In [40]:
l
Out[40]:
['uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']
In [41]:
# Inspect one monthly file to see the schema of the 2014 data (Date/Time, Lat, Lon, Base).
uber1 = pd.read_csv(f"{path}Datasets\\uber-raw-data-apr14.csv")
uber1
Out[41]:
Date/Time Lat Lon Base
0 4/1/2014 0:11:00 40.7690 -73.9549 B02512
1 4/1/2014 0:17:00 40.7267 -74.0345 B02512
2 4/1/2014 0:21:00 40.7316 -73.9873 B02512
3 4/1/2014 0:28:00 40.7588 -73.9776 B02512
4 4/1/2014 0:33:00 40.7594 -73.9722 B02512
... ... ... ... ...
564511 4/30/2014 23:22:00 40.7640 -73.9744 B02764
564512 4/30/2014 23:26:00 40.7629 -73.9672 B02764
564513 4/30/2014 23:31:00 40.7443 -73.9889 B02764
564514 4/30/2014 23:32:00 40.6756 -73.9405 B02764
564515 4/30/2014 23:48:00 40.6880 -73.9608 B02764

564516 rows × 4 columns

In [42]:
# Read each 2014 monthly file once and concatenate a single time.
# The original grew `uber_final` with pd.concat inside the loop (quadratic
# copying) and seeded it with an empty all-float frame, which can pollute
# the dtypes of the result; building a list and concatenating once avoids both.
frames = [pd.read_csv(os.path.join(path, "Datasets", fname)) for fname in l]
uber_final = pd.concat(frames, ignore_index=True)
In [43]:
uber_final.shape
Out[43]:
(4534327, 4)
In [44]:
# Count pickups per exact (Lat, Lon) coordinate, busiest locations first.
uber_rush = (
    uber_final.groupby(["Lat", "Lon"], as_index=False)
              .size()
              .sort_values(by="size", ascending=False)
)
uber_rush
Out[44]:
Lat Lon size
32881 40.6448 -73.7819 2299
432167 40.7685 -73.8625 2257
32880 40.6448 -73.7820 2079
33057 40.6449 -73.7822 1947
453734 40.7741 -73.8726 1921
... ... ... ...
226629 40.7232 -73.7992 1
226627 40.7232 -73.7996 1
226626 40.7232 -73.7997 1
226625 40.7232 -73.7999 1
574557 42.1166 -72.0666 1

574558 rows × 3 columns

In [45]:
# Base map (default world view); the pickup-density layer is added below.
m=folium.Map()
m
Out[45]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [46]:
# HeatMap reads each row as (lat, lon, weight); the `size` column acts as weight.
# NOTE(review): ~575k points can make the rendered map very heavy — consider
# passing only the top-N busiest coordinates.
HeatMap(uber_rush).add_to(m)
m
Out[46]:
Make this Notebook Trusted to load map: File -> Trust Notebook